Environment Set-up

#is_gluonts_activated()
#get_python_env()
#check_gluonts_dependencies()
#detect_default_gluonts_env()
#remotes::install_github("business-science/modeltime.gluonts")
#modeltime.gluonts::install_gluonts()
# GluonTS Installation - Run 1st time
#modeltime.gluonts::install_gluonts(fresh_install = T, include_pytorch = F)
library(modeltime.gluonts)
## Loading required package: modeltime
## Warning: package 'modeltime' was built under R version 4.1.2
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.1.2
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## -- Attaching packages -------------------------------------- tidymodels 0.1.4 --
## v broom        0.7.11     v recipes      0.1.17
## v dials        0.0.10     v rsample      0.1.1 
## v dplyr        1.0.7      v tibble       3.1.6 
## v ggplot2      3.3.5      v tidyr        1.1.4 
## v infer        1.0.0      v tune         0.1.6 
## v modeldata    0.1.1      v workflows    0.2.4 
## v parsnip      0.1.7      v workflowsets 0.1.0 
## v purrr        0.3.4      v yardstick    0.0.9
## Warning: package 'dials' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'infer' was built under R version 4.1.2
## Warning: package 'modeldata' was built under R version 4.1.2
## Warning: package 'parsnip' was built under R version 4.1.2
## Warning: package 'recipes' was built under R version 4.1.2
## Warning: package 'rsample' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'tune' was built under R version 4.1.2
## Warning: package 'workflows' was built under R version 4.1.2
## Warning: package 'workflowsets' was built under R version 4.1.2
## Warning: package 'yardstick' was built under R version 4.1.2
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v readr   2.1.1     v forcats 0.5.1
## v stringr 1.4.0
## Warning: package 'readr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()
library(timetk)
## Warning: package 'timetk' was built under R version 4.1.2
library(modeltime)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(mice)
## Warning: package 'mice' was built under R version 4.1.2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

Project

# Load the weekly product time series and coerce column types.
# NOTE(review): na.strings = " " treats a single space as NA — confirm the raw
# CSV really encodes missing values as " " rather than "".
product_ts_cmb <- read.csv("product_ts_full.csv", na.strings = " ")
product_ts_cmb$ISOWEEKS <- as.Date(product_ts_cmb$ISOWEEKS)
product_ts_cmb$CATEGORY <- as.factor(product_ts_cmb$CATEGORY)
head(product_ts_cmb)
##   WEEK   ISOWEEKS YEARWEEK    SUM PRICE PROMO         CATEGORY
## 1    1 1989-01-01  1989-00 177538 0.575     1 FrontEnd Candies
## 2    1 1989-01-01  1989-00 170067 0.950     1     Frozen Juice
## 3    1 1989-01-01  1989-00 138771 1.880     1    Bottled Juice
## 4    1 1989-01-01  1989-00 478899 1.990     1       Soft Drink
## 5    2 1989-01-08  1989-01 174278 0.575     1 FrontEnd Candies
## 6    2 1989-01-08  1989-01 140251 1.040     1     Frozen Juice
# Keep only the modeling columns. The original group_by(CATEGORY) %>% ungroup()
# pair had no grouping effect (it was immediately undone) beyond coercing the
# data.frame to a tibble, so it is replaced with an explicit as_tibble().
data <- product_ts_cmb %>%
  select(CATEGORY, ISOWEEKS, PRICE, PROMO, SUM) %>%
  as_tibble()
# Weekly sales volume (SUM) per category, faceted three across.
plot_time_series(
  group_by(data, CATEGORY),
  ISOWEEKS,
  SUM,
  .facet_ncol  = 3,
  .interactive = FALSE
)

# Weekly price per category, faceted three across.
plot_time_series(
  group_by(data, CATEGORY),
  ISOWEEKS,
  PRICE,
  .facet_ncol  = 3,
  .interactive = FALSE
)

# Forecast horizon: 52 weeks (one year of weekly observations).
HORIZON <- 52

# Extend each category's series HORIZON weeks into the future. The date column
# is named explicitly so future_frame() no longer has to guess it (the knit
# output showed ".date_var is missing. Using: ISOWEEKS").
new_data <- data %>%
  group_by(CATEGORY) %>%
  future_frame(.date_var = ISOWEEKS, .length_out = HORIZON) %>%
  ungroup()
## .date_var is missing. Using: ISOWEEKS

Data processing - Training and test

# Summary statistics for the full panel, formatted with pander.
pander::pander(summary(data))
Table continues below
CATEGORY ISOWEEKS PRICE PROMO
Bath Soap :265 Min. :1989-01-01 Min. :0.000 Min. :0.0000
Beer :303 1st Qu.:1991-05-26 1st Qu.:0.990 1st Qu.:1.0000
Bottled Juice :392 Median :1993-02-07 Median :1.290 Median :1.0000
FrontEnd Candies:396 Mean :1993-01-26 Mean :1.567 Mean :0.9911
Frozen Juice :396 3rd Qu.:1994-12-02 3rd Qu.:1.970 3rd Qu.:1.0000
Soft Drink :390 Max. :1996-08-18 Max. :6.315 Max. :1.0000
SUM
Min. : 0
1st Qu.: 63770
Median : 134317
Mean : 228657
3rd Qu.: 200924
Max. :1605204

Selecting Category for model training

# Restrict modeling to a single category; change the string (or uncomment the
# line below) to model a different category or the full panel.
filtereddata <- filter(data, CATEGORY == 'FrontEnd Candies')
# filtereddata <- data

PMM Imputation

# Treat zero sales and zero prices as missing so they are imputed below.
filtereddata$SUM[filtereddata$SUM == 0] <- NA
filtereddata$PRICE[filtereddata$PRICE == 0] <- NA

# Single imputation via predictive mean matching, seeded for reproducibility.
imp <- mice(
  filtereddata,
  m     = 1,
  maxit = 50,
  meth  = 'pmm',
  seed  = 500
)
## 
##  iter imp variable
##   1   1
##   2   1
##   3   1
##   4   1
##   5   1
##   6   1
##   7   1
##   8   1
##   9   1
##   10   1
##   11   1
##   12   1
##   13   1
##   14   1
##   15   1
##   16   1
##   17   1
##   18   1
##   19   1
##   20   1
##   21   1
##   22   1
##   23   1
##   24   1
##   25   1
##   26   1
##   27   1
##   28   1
##   29   1
##   30   1
##   31   1
##   32   1
##   33   1
##   34   1
##   35   1
##   36   1
##   37   1
##   38   1
##   39   1
##   40   1
##   41   1
##   42   1
##   43   1
##   44   1
##   45   1
##   46   1
##   47   1
##   48   1
##   49   1
##   50   1
## Warning: Number of logged events: 1
# Pull the first (and only) completed dataset out of the mice object and
# re-summarise to confirm the zeros are gone.
filtereddata <- complete(imp, 1)
pander::pander(summary(filtereddata))
Table continues below
CATEGORY ISOWEEKS PRICE PROMO
Bath Soap : 0 Min. :1989-01-01 Min. :0.3300 Min. :0.0000
Beer : 0 1st Qu.:1990-11-23 1st Qu.:0.4500 1st Qu.:1.0000
Bottled Juice : 0 Median :1992-10-14 Median :0.4900 Median :1.0000
FrontEnd Candies:396 Mean :1992-10-18 Mean :0.5376 Mean :0.9823
Frozen Juice : 0 3rd Qu.:1994-09-12 3rd Qu.:0.6162 3rd Qu.:1.0000
Soft Drink : 0 Max. :1996-08-18 Max. :2.0000 Max. :1.0000
SUM
Min. : 93343
1st Qu.:148348
Median :172761
Mean :178825
3rd Qu.:200363
Max. :473181
# Autocorrelation of weekly sales out to three years (156 weekly lags).
print(acf(filtereddata$SUM, lag.max = 156))

## 
## Autocorrelations of series 'filtereddata$SUM', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.485  0.473  0.409  0.504  0.388  0.349  0.405  0.296  0.320  0.291 
##     11     12     13     14     15     16     17     18     19     20     21 
##  0.310  0.229  0.273  0.245  0.280  0.233  0.185  0.176  0.153  0.148  0.159 
##     22     23     24     25     26     27     28     29     30     31     32 
##  0.133  0.138  0.101  0.082  0.118  0.091  0.148  0.093  0.169  0.092  0.164 
##     33     34     35     36     37     38     39     40     41     42     43 
##  0.109  0.172  0.140  0.193  0.153  0.125  0.164  0.143  0.222  0.165  0.210 
##     44     45     46     47     48     49     50     51     52     53     54 
##  0.148  0.236  0.199  0.234  0.232  0.219  0.261  0.281  0.301  0.230  0.236 
##     55     56     57     58     59     60     61     62     63     64     65 
##  0.265  0.222  0.200  0.190  0.175  0.164  0.121  0.160  0.137  0.134  0.093 
##     66     67     68     69     70     71     72     73     74     75     76 
##  0.125  0.073  0.075  0.066  0.063  0.104  0.023  0.048 -0.007  0.099  0.011 
##     77     78     79     80     81     82     83     84     85     86     87 
##  0.018  0.051  0.063  0.015  0.004  0.064 -0.005 -0.006  0.001  0.063  0.003 
##     88     89     90     91     92     93     94     95     96     97     98 
##  0.011  0.030  0.046  0.054  0.055  0.063  0.073  0.052  0.091  0.090  0.108 
##     99    100    101    102    103    104    105    106    107    108    109 
##  0.060  0.133  0.112  0.133  0.139  0.169  0.170  0.136  0.147  0.099  0.068 
##    110    111    112    113    114    115    116    117    118    119    120 
##  0.088  0.095  0.047  0.040  0.036  0.052 -0.018 -0.020 -0.024 -0.015 -0.048 
##    121    122    123    124    125    126    127    128    129    130    131 
## -0.057 -0.102 -0.104 -0.103 -0.111 -0.075 -0.123 -0.089 -0.160 -0.082 -0.135 
##    132    133    134    135    136    137    138    139    140    141    142 
## -0.120 -0.060 -0.057 -0.077 -0.102 -0.052 -0.112 -0.082 -0.089 -0.045 -0.042 
##    143    144    145    146    147    148    149    150    151    152    153 
## -0.029 -0.059 -0.062 -0.051 -0.020 -0.026 -0.035 -0.043 -0.031 -0.017 -0.032 
##    154    155    156 
## -0.003  0.004  0.029
# Autocorrelation of weekly price out to three years (156 weekly lags).
print(acf(filtereddata$PRICE, lag.max = 156))

## 
## Autocorrelations of series 'filtereddata$PRICE', by lag
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  1.000  0.667  0.584  0.431  0.457  0.430  0.410  0.428  0.390  0.451  0.418 
##     11     12     13     14     15     16     17     18     19     20     21 
##  0.440  0.389  0.433  0.438  0.440  0.433  0.377  0.344  0.354  0.369  0.356 
##     22     23     24     25     26     27     28     29     30     31     32 
##  0.292  0.298  0.303  0.309  0.316  0.287  0.271  0.265  0.288  0.263  0.269 
##     33     34     35     36     37     38     39     40     41     42     43 
##  0.270  0.303  0.285  0.287  0.276  0.254  0.263  0.218  0.189  0.177  0.202 
##     44     45     46     47     48     49     50     51     52     53     54 
##  0.179  0.188  0.230  0.268  0.271  0.290  0.298  0.286  0.242  0.224  0.206 
##     55     56     57     58     59     60     61     62     63     64     65 
##  0.189  0.187  0.132  0.167  0.169  0.199  0.191  0.227  0.228  0.241  0.195 
##     66     67     68     69     70     71     72     73     74     75     76 
##  0.191  0.146  0.124  0.111  0.109  0.105  0.093  0.073  0.066  0.072  0.079 
##     77     78     79     80     81     82     83     84     85     86     87 
##  0.075  0.066  0.068  0.048  0.043  0.043  0.056  0.050  0.061  0.044  0.042 
##     88     89     90     91     92     93     94     95     96     97     98 
##  0.032  0.045  0.037 -0.021 -0.019 -0.038 -0.028 -0.009  0.000 -0.006 -0.006 
##     99    100    101    102    103    104    105    106    107    108    109 
## -0.014 -0.025 -0.032 -0.031 -0.034 -0.046 -0.059 -0.068 -0.087 -0.058 -0.054 
##    110    111    112    113    114    115    116    117    118    119    120 
## -0.042 -0.067 -0.066 -0.055 -0.041 -0.041 -0.062 -0.063 -0.063 -0.067 -0.087 
##    121    122    123    124    125    126    127    128    129    130    131 
## -0.068 -0.070 -0.068 -0.077 -0.065 -0.055 -0.061 -0.061 -0.065 -0.060 -0.063 
##    132    133    134    135    136    137    138    139    140    141    142 
## -0.066 -0.077 -0.081 -0.094 -0.079 -0.053 -0.030 -0.022 -0.037 -0.059 -0.055 
##    143    144    145    146    147    148    149    150    151    152    153 
## -0.067 -0.081 -0.083 -0.082 -0.058 -0.054 -0.071 -0.092 -0.089 -0.082 -0.065 
##    154    155    156 
## -0.069 -0.075 -0.083

Feature Engineering

# Feature-engineering recipe:
#  * calendar signature from ISOWEEKS, dropping iso/xts and sub-weekly columns
#  * ISO week number as an ordered factor
#  * one-step lags of the target and both regressors (default 1 pads row 1)
#  * dummy-encoded nominal predictors (originals kept for the GluonTS models)
#  * normalised trend features plus US holiday indicators
# T/F spelled out as TRUE/FALSE (T and F are reassignable).
recipe_spec <- recipe(SUM ~ ., filtereddata) %>%
    step_timeseries_signature(ISOWEEKS) %>%
    step_rm(matches("(iso$)|(xts$)|(day)|(hour)|(min)|(sec)|(am.pm)")) %>%
    step_mutate(Date_week = factor(ISOWEEKS_week, ordered = TRUE)) %>%
    step_lag(SUM, lag = 1, default = 1) %>%
    step_lag(PRICE, lag = 1, default = 1) %>%
    step_lag(PROMO, lag = 1, default = 1) %>%
    step_dummy(all_nominal(), keep_original_cols = TRUE) %>%
    step_normalize(contains("index.num"), ISOWEEKS_year) %>%
    step_holiday_signature(ISOWEEKS,
                           holiday_pattern = "^US_",
                           locale_set      = "US")

# Preview the engineered design matrix.
recipe_spec %>% prep() %>% juice()
## # A tibble: 396 x 111
##    CATEGORY         ISOWEEKS   PRICE PROMO    SUM ISOWEEKS_index.~ ISOWEEKS_year
##    <fct>            <date>     <dbl> <int>  <int>            <dbl>         <dbl>
##  1 FrontEnd Candies 1989-01-01 0.575     1 177538            -1.72         -1.50
##  2 FrontEnd Candies 1989-01-08 0.575     1 174278            -1.71         -1.50
##  3 FrontEnd Candies 1989-01-15 0.625     1 169752            -1.70         -1.50
##  4 FrontEnd Candies 1989-01-22 0.7       1 172676            -1.69         -1.50
##  5 FrontEnd Candies 1989-01-29 0.7       1 167725            -1.69         -1.50
##  6 FrontEnd Candies 1989-02-05 0.425     1 169454            -1.68         -1.50
##  7 FrontEnd Candies 1989-02-12 0.4       0 137547            -1.67         -1.50
##  8 FrontEnd Candies 1989-02-19 0.575     1 133348            -1.66         -1.50
##  9 FrontEnd Candies 1989-02-26 0.575     1 154926            -1.65         -1.50
## 10 FrontEnd Candies 1989-03-05 0.575     1 174732            -1.64         -1.50
## # ... with 386 more rows, and 104 more variables: ISOWEEKS_half <int>,
## #   ISOWEEKS_quarter <int>, ISOWEEKS_month <int>, ISOWEEKS_month.lbl <ord>,
## #   ISOWEEKS_mweek <int>, ISOWEEKS_week <int>, ISOWEEKS_week2 <int>,
## #   ISOWEEKS_week3 <int>, ISOWEEKS_week4 <int>, Date_week <ord>,
## #   lag_1_SUM <dbl>, lag_1_PRICE <dbl>, lag_1_PROMO <dbl>, CATEGORY_Beer <dbl>,
## #   CATEGORY_Bottled.Juice <dbl>, CATEGORY_FrontEnd.Candies <dbl>,
## #   CATEGORY_Frozen.Juice <dbl>, CATEGORY_Soft.Drink <dbl>, ...

Split and Settings

# Hold out the final 2 weeks for testing; train on all prior observations.
splits <- time_series_split(
  filtereddata,
  assess     = "2 weeks",  # evaluation horizon
  cumulative = TRUE
)
## Using date_var: ISOWEEKS
# Visualise the train/test split. The original call carried a stray trailing
# comma (an empty argument) — removed; T spelled out as TRUE.
splits %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(ISOWEEKS, SUM, .interactive = TRUE, .facet_ncol = 2)

# Number of periods the GluonTS models must predict (test size + 1).
future <- nrow(testing(splits)) + 1

Seed Settings

# Seed MXNet's RNG for reproducible GluonTS training. The original code did
# `mxnet$random$seed <- 123`, which *assigns* 123 over the Python seed
# function instead of calling it — the RNG was never actually seeded. The
# function must be invoked with an integer argument.
mxnet <- reticulate::import("mxnet")
mxnet$random$seed(123L)

Model 1: ARIMA

# Model 1: automatic ARIMA, fitted directly on the training split ----
model_fit_arima_no_boost <- arima_reg() %>%
    set_engine(engine = "auto_arima") %>%
    fit(SUM ~ ., data = training(splits))
## frequency = 13 observations per 1 quarter

Model 2: Exponential Smoothing

# Model 2: exponential smoothing (ETS) specification ----
# NOTE(review): despite the "fit" in its name this is an unfitted spec; it is
# fitted later inside a workflow. Name kept so downstream code still works.
model_fit_ets <- exp_smoothing(
  # seasonal_period = 52,
  error  = "multiplicative",
  trend  = "additive",
  season = "multiplicative"
) %>%
  set_engine(engine = "ets")

Model 3: Prophet

# Model 3: Prophet specification with multiplicative seasonality.
# NOTE(review): weekly seasonality cannot be estimated from weekly-aggregated
# data, while yearly seasonality is switched off — confirm these flags are
# intentional. T/F spelled out; <- used for assignment.
model_spec_prophet <- prophet_reg(
  seasonality_weekly = TRUE,
  seasonality_yearly = FALSE,
  season             = "multiplicative"
)

Model 4: Random Forest

# Model 4: random forest, 500 trees, minimum node size 50.
model_spec_rf <- rand_forest(mode = "regression", trees = 500, min_n = 50) %>%
  set_engine("randomForest")

Model 5: DeepAR - LSTM

# Model 5: GluonTS DeepAR with the default LSTM cell. Looks back three
# horizons (156 weeks) and predicts `future` weekly steps per CATEGORY.
# F spelled out as FALSE; <- used for assignment.
model_spec_deepar_lstm <- deep_ar(
  id                = "CATEGORY",
  freq              = "W",
  prediction_length = future,
  lookback_length   = 3 * HORIZON,
  num_layers        = 1,
  epochs            = 5,
  scale             = FALSE
) %>%
  set_engine("gluonts_deepar")

Model 5C: DeepAR - GRU

# Model 5C: GluonTS DeepAR with a GRU cell instead of the default LSTM;
# otherwise identical hyper-parameters to the LSTM variant.
# F spelled out as FALSE; <- used for assignment.
model_spec_deepar_gru <- deep_ar(
  id                = "CATEGORY",
  freq              = "W",
  prediction_length = future,
  lookback_length   = 3 * HORIZON,
  num_layers        = 1,
  epochs            = 5,
  cell_type         = "gru",
  scale             = FALSE
) %>%
  set_engine("gluonts_deepar")

Fitted Workflow

#ARIMA
#wflw_fit_arima_no_boost <- workflow() %>%
#add_model(model_fit_arima_no_boost) %>%
#add_recipe(recipe_spec %>% step_rm(all_nominal()) ) %>%
#fit(training(splits))



# ETS: fit the spec through the feature-engineering recipe.
wflw_fit_ets <- workflow() %>%
  add_model(model_fit_ets) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))
## frequency = 13 observations per 1 quarter
# Prophet: fit the spec through the feature-engineering recipe.
wflw_fit_prophet <- workflow() %>%
  add_model(model_spec_prophet) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))
## Warning: Engine set to `prophet`.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Random forest: the date column itself is removed since randomForest
# cannot use a Date predictor directly.
workflow_fit_rf <- workflow() %>%
  add_model(model_spec_rf) %>%
  add_recipe(recipe_spec %>% step_rm("ISOWEEKS")) %>%
  fit(training(splits))

# DeepAR (LSTM): fit through the feature-engineering recipe.
wflw_fit_deepar_lstm <- workflow() %>%
  add_model(model_spec_deepar_lstm) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))

# DeepAR (GRU): fit through the feature-engineering recipe.
wflw_fit_deepar_gru <- workflow() %>%
  add_model(model_spec_deepar_gru) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))

Sub Model Evaluation

# Collect every fitted model into one modeltime table for joint evaluation.
submodels_tbl <- modeltime_table(
  model_fit_arima_no_boost,  # fitted directly, not via a workflow
  wflw_fit_ets,
  wflw_fit_prophet,
  wflw_fit_deepar_lstm,
  wflw_fit_deepar_gru,
  workflow_fit_rf
)



# Out-of-sample accuracy (MAE, MAPE, MASE, SMAPE, RMSE, R^2) on the hold-out.
submodels_tbl %>%
  modeltime_accuracy(testing(splits)) %>%
  table_modeltime_accuracy(.interactive = FALSE)
Accuracy Table
.model_id .model_desc .type mae mape mase smape rmse rsq
1 REGRESSION WITH ARIMA(0,1,1) ERRORS Test 3407.03 2.48 0.51 2.50 3563.17 1
2 ETS(M,AD,M) Test 4951.54 3.62 0.75 3.65 5104.60 1
3 PROPHET W/ REGRESSORS Test 13194.58 9.54 1.99 10.18 15776.37 1
4 DEEPAR Test 3108.36 2.32 0.47 2.29 3620.94 1
5 DEEPAR Test 3998.95 2.91 0.60 2.96 4354.12 1
6 RANDOMFOREST Test 11475.13 8.49 1.73 8.12 11897.19 1
# Forecast the test window with every model and plot against the actuals,
# faceted by CATEGORY. T spelled out as TRUE (T is reassignable).
submodels_tbl %>%
  modeltime_forecast(
    new_data    = testing(splits),
    actual_data = filtereddata,
    keep_data   = TRUE
  ) %>%
  group_by(CATEGORY) %>%
  plot_modeltime_forecast(
    .interactive      = TRUE,
    .facet_ncol       = 2,
    .legend_max_width = 8
  )
## Warning: Expecting the following names to be in the data frame: .conf_hi, .conf_lo. 
## Proceeding with '.conf_interval_show = FALSE' to visualize the forecast without confidence intervals.
## Alternatively, try using `modeltime_calibrate()` before forecasting to add confidence intervals.